import torch
import matplotlib.pyplot as plt
from IPython.display import Audio

#TODO: changeme
%cd /Users/janne/git/tutorial/codes
# from codes.data_loader import GTZANLoader
from utils import plot_spectrogram
/Users/janne/git/tutorial/codes

Audio Data Augmentations

In this chapter, we will discuss common transformations that we can apply to audio signals in the time domain. We will refer to these as “audio data augmentations”.

Data augmentations are a set of methods that add modified copies to a dataset, from the existing data. This process creates many variations of natural data, and can act as a regulariser to reduce the problem of overfitting. It can also help deep neural networks become robust to complex variations of natural data, which improves their generalisation performance.

In the field of computer vision, the transformations that we apply to images are often very self-explanatory. Take this image, for example. It becomes fairly obvious that we have applied various amounts of Gaussian blurring on this image.

alt text

Naturally, we cannot translate transformations from the vision domain directly to the audio domain. Before we explore a battery of audio data augmentations, we now list the currently available code libraries:

Code Libraries

Name

Author

Framework

Language

License

Link

Muda

B. McFee et al. (2015)

General Purpose

Python

ISC License

source code

Audio Degradation Toolbox

M. Mauch et al. (2013)

General Purpose

MATLAB

GNU General Public License 2.0

source code

rubberband

-

General Purpose

C++

GNU General Public License (non-commercial)

website, pyrubberband

audiomentations

I. Jordal (2021)

General Purpose

Python

MIT License

source code

tensorflow-io

tensorflow.org

TensorFlow

Python

Apache 2.0 License

tutorial

torchaudio

pytorch.org

PyTorch

Python

BSD 2-Clause “Simplified” License

source code

torch-audiomentations

Asteroid (2021)

PyTorch

Python

MIT License

source code

torchaudio-augmentations

J. Spijkervet (2021)

PyTorch

Python

MIT License

source code

Listening

One of the most essential, and yet overlooked, parts of music research is exploring and observing the data. This also applies to data augmentation research: one has to develop a general understanding of the effect of transformations that can be applied to audio. Even more so, when transformations are applied sequentially.

For instance, we will understand why a reverb applied before a frequency filter will sound different than when the reverb is applied after the frequency filter. Before we develop this intuition, let’s listen to a series of audio data augmentations.

from torchaudio.datasets import GTZAN

# Download the GTZAN genre-classification dataset into the current directory
# and inspect one example track.
dataset = GTZAN(root=".", download=True)
idx = 5
# BUG FIX: the original f-string contained a stray "f" before the placeholder
# ("f{len(dataset)}"), so the cell printed "f1000" instead of "1000".
print(f"Number of datapoints in the GTZAN dataset: {len(dataset)}\n")
print(f"Selected track no.: {idx}")
# Each GTZAN item is a (waveform, sample_rate, genre_label) tuple; the
# waveform tensor is indexed as (channel, time) below.
audio, sr, genre = dataset[idx]
print(f"Genre: {genre}\nSample rate: {sr}\nChannels: {audio.shape[0]}\nSamples: {audio.shape[1]}")
display(Audio(audio, rate=sr))
Number of datapoints in the GTZAN dataset: f1000

Selected track no.: 5
Genre: blues
Sample rate: 22050
Channels: 1
Samples: 661794

Random Crop

Similar to how we can crop an image, so that only a subset of the image is represented, we can ‘crop’ a piece of audio by selecting a fragment between two time points $t_0 - t_1$.

Various terms for this exist, e.g., slicing or trimming.

Frequency Filter

Note

In these examples and the accompanying code, we assume the shape of audio ordered in our array is as follows: (channel, time)

from torch_audiomentations import LowPassFilter

# Attenuate content above ~3 kHz. p=1.0 makes the filter deterministic, and
# the one-hertz min/max window effectively pins the (otherwise random) cutoff.
low_pass = LowPassFilter(
    sample_rate=sr,
    p=1.0,
    min_cutoff_freq=3000,
    max_cutoff_freq=3001,
)
# torch-audiomentations operates on (batch, channel, time) tensors, so wrap
# the call with a batch dimension and strip it off again.
batched = audio.unsqueeze(0)
taudio = low_pass(batched).squeeze(0)

print("Original")
display(Audio(audio, rate=sr))
plot_spectrogram(audio, sr)

print("LowPassFilter")
display(Audio(taudio, rate=sr))
plot_spectrogram(taudio, sr)
Original
../_images/data-augmentation_8_2.png
LowPassFilter
../_images/data-augmentation_8_5.png

Delay

from torchaudio_augmentations import Delay

# A ~200 ms echo; the one-millisecond min/max window effectively fixes the
# otherwise random delay time.
delay_transform = Delay(sample_rate=sr, min_delay=200, max_delay=201)
taudio = delay_transform(audio)

print("Original")
display(Audio(audio, rate=sr))
plot_spectrogram(audio, sr)

print(f"Delay of {200}ms")
display(Audio(taudio, rate=sr))
plot_spectrogram(taudio, sr)
Original
../_images/data-augmentation_10_2.png
Delay of 200ms
../_images/data-augmentation_10_5.png

Comb filter

When we apply a delayed signal to the original with a short timespan, it will cause interference.

from torchaudio_augmentations import Delay

# With a very short delay (~60 ms) the delayed copy interferes with the
# original, producing a comb-filter-like effect.
short_delay = Delay(sample_rate=sr, min_delay=60, max_delay=61)
taudio = short_delay(audio)

print("Original")
display(Audio(audio, rate=sr))
plot_spectrogram(audio, sr)

print(f"Delay of {61}ms")
display(Audio(taudio, rate=sr))
plot_spectrogram(taudio, sr)
Original
../_images/data-augmentation_12_2.png
Delay of 61ms
../_images/data-augmentation_12_5.png

Pitch Shift

from torchaudio_augmentations import PitchShift

# NOTE(review): the parameter names say "cents" while the printout below says
# "semitones" — confirm the unit against the library's documentation.
pitch_shift = PitchShift(
    sample_rate=sr,
    n_samples=audio.shape[1],
    pitch_cents_min=4,
    pitch_cents_max=5,
)
taudio = pitch_shift(audio)

print("Original")
display(Audio(audio, rate=sr))
plot_spectrogram(audio, sr, title="Original")

print(f"Pitch shift of {4} semitones")
display(Audio(taudio, rate=sr))
plot_spectrogram(taudio, sr, title="Pitch shift")
Original
../_images/data-augmentation_14_2.png
Pitch shift of 4 semitones
../_images/data-augmentation_14_5.png

Reverb

from torchaudio_augmentations import Reverb

# A heavy reverb: both reverberance and room size are pinned near 90 by the
# one-unit min/max windows.
reverb_transform = Reverb(
    sample_rate=sr,
    reverberance_min=90,
    reverberance_max=91,
    room_size_min=90,
    room_size_max=91,
)
taudio = reverb_transform(audio)

print("Original")
display(Audio(audio, rate=sr))
plot_spectrogram(audio, sr, title="Original")

print(f"Reverb")
display(Audio(taudio, rate=sr))
plot_spectrogram(taudio, sr, title="Reverb")
Original
../_images/data-augmentation_16_2.png
Reverb
../_images/data-augmentation_16_5.png

Gain

Warning

In Jupyter notebook’s Audio() object, we have to set normalize=False so that we can hear an unnormalized version of the audio. This is important to reflect the true audio transformation output.

from torchaudio_augmentations import Gain

# Attenuate the signal by roughly 15-16 dB.
gain_transform = Gain(min_gain=-16, max_gain=-15)
taudio = gain_transform(audio)

print("Original")
display(Audio(audio, rate=sr))
plot_spectrogram(audio, sr, title="Original")

print(f"Gain")
# normalize=False so playback reflects the attenuated amplitude rather than
# being re-normalised by the Audio widget (see the warning above this cell).
display(Audio(taudio, rate=sr, normalize=False))
plot_spectrogram(taudio, sr, title="Gain")
Original
../_images/data-augmentation_18_2.png
Gain
../_images/data-augmentation_18_5.png

Noise

from torchaudio_augmentations import Noise

# Additive noise with a fixed signal-to-noise ratio (min == max == 0.04).
noise_transform = Noise(min_snr=0.04, max_snr=0.04)
taudio = noise_transform(audio)

print("Original")
display(Audio(audio, rate=sr))
plot_spectrogram(audio, sr, title="Original")

print(f"Noise")
display(Audio(taudio, rate=sr, normalize=True))
plot_spectrogram(taudio, sr, title="Noise")
Original
../_images/data-augmentation_20_2.png
Noise
../_images/data-augmentation_20_5.png

Polarity Inversion

import math
# BUG FIX: PolarityInversion was used here before its import, which only
# appears in a later cell — running the notebook top-to-bottom raised NameError.
from torchaudio_augmentations import PolarityInversion

# One full cycle of a 440 Hz sine wave, shaped (channel, time).
l = 1/440.0
test_audio = torch.sin(math.tau * 440.0 * torch.linspace(0, l, int(l*sr))).unsqueeze(0)
plt.plot(test_audio.squeeze(0))
plt.grid()
plt.xticks([])
plt.show()

# Polarity inversion flips the sign of every sample, mirroring the waveform
# around the time axis.
inverted_test_audio = PolarityInversion()(test_audio)
plt.plot(inverted_test_audio.squeeze(0))
plt.grid()
plt.xticks([])
plt.show()
../_images/data-augmentation_22_0.png ../_images/data-augmentation_22_1.png
from torchaudio_augmentations import PolarityInversion

# Invert the polarity of the track, then demonstrate destructive
# interference: summing a signal with its inversion cancels it out.
taudio = PolarityInversion()(audio)

print("Original")
display(Audio(audio, rate=sr))
plot_spectrogram(audio, sr, title="Original")

print(f"Polarity Inversion")
display(Audio(taudio, rate=sr, normalize=True))
plot_spectrogram(taudio, sr, title="Polarity Inversion")

summed = audio + taudio
print(f"Original + Polarity Inversion")
display(Audio(summed, rate=sr, normalize=True))
plot_spectrogram(summed, sr, title="Original + Polarity Inversion")
Original
../_images/data-augmentation_23_2.png
Polarity Inversion
../_images/data-augmentation_23_5.png
Original + Polarity Inversion
../_images/data-augmentation_23_8.png

Sequential Audio Data Augmentations

Now that we have built up some intuition of some of the audio transformations, let us observe how they can be applied sequentially. More importantly, to develop an understanding on how different audio transformations interact when we apply them before, or after each other.

For this, we can use a Compose module, which takes as input a list of audio transformations. These will be applied in the order they appear in the supplied list. This interface is similar to the Compose modules of torchvision.transforms and torchaudio.transforms.

from torchaudio_augmentations import Compose, HighLowPass

# Chain a delay with a random high-/low-pass filter; Compose applies the
# transforms in list order.
chain = [
    Delay(sample_rate=sr),
    HighLowPass(sample_rate=sr),
]
transform = Compose(chain)
transformed_audio = transform(audio)

print("Original:")
display(Audio(audio, rate=sr))
print("Transform:", transform)
display(Audio(transformed_audio, rate=sr))
Original:
Transform: Compose(
	Delay()
	HighLowPass()
)

Now that we have listened to what a sequential audio transformation sounds like, let’s observe how two different transforms interact when they are applied in a different sequential order.

Let’s take the following two transforms:

  • Noise

  • Reverb

A signal that does not have any reverberation added, is commonly called a dry signal. A signal that is reverberated is called a wet signal.

When we first apply the Noise transform, the Reverb transform will apply the reverberation to the dry signal and the added noise signal. This will result in a completely wet signal.

Conversely, when we first apply the Reverb transform, the Noise signal will be added after the reverberated signal. The noise is thus dry, i.e., it is not reverberated.

from torchaudio_augmentations import Compose

# Compare the two orderings of Noise and Reverb:
#   transform1 reverberates signal + noise (a fully "wet" result),
#   transform2 adds dry noise on top of the already-reverberated signal.
noise = Noise(min_snr=0.05, max_snr=0.06)
# NOTE: "dumping_factor" is the parameter's actual spelling in
# torchaudio-augmentations.
reverb = Reverb(sample_rate=sr, reverberance_min=80, reverberance_max=81, dumping_factor_min=0, dumping_factor_max=1, room_size_min=80, room_size_max=81)

transform1 = Compose([noise, reverb])
transform2 = Compose([reverb, noise])

print("Transform 1:", transform1)
taudio1 = transform1(audio)
taudio2 = transform2(audio)

display(Audio(taudio1, rate=sr))
plot_spectrogram(taudio1, sr, title="Transform 1")

# FIX: label was previously just "Transform:", inconsistent with "Transform 1:".
print("Transform 2:", transform2)
display(Audio(taudio2, rate=sr))
plot_spectrogram(taudio2, sr, title="Transform 2")
Transform 1: Compose(
	Noise()
	Reverb()
)
../_images/data-augmentation_27_2.png
Transform: Compose(
	Reverb()
	Noise()
)
../_images/data-augmentation_27_5.png

More Sequential Audio Data Augmentations

Let’s continue to develop our intuition for sequential audio transformations a bit more in the following examples:

# 4 seconds of audio
num_samples = sr * 4

# Crop -> band-limit -> delay, applied in that order by Compose.
crop = RandomResizedCrop(n_samples=num_samples)
band_filter = HighLowPass(
    sample_rate=sr,
    lowpass_freq_low=2200,
    lowpass_freq_high=4000,
    highpass_freq_low=200,
    highpass_freq_high=1200,
)
echo = Delay(
    sample_rate=sr,
    volume_factor=0.5,
    min_delay=100,
    max_delay=500,
    delay_interval=1,
)
# `transforms` is reused by the ComposeMany example further on.
transforms = [crop, band_filter, echo]
transform = Compose(transforms)

print("Transform:", transform)
transformed_audio = transform(audio)

display(Audio(transformed_audio, rate=sr))
Transform: Compose(
	RandomResizedCrop()
	HighLowPass()
	Delay()
)

Instead of retrieving a single augmented example, let’s return 4 different views of the original sound:

# we want 4 augmented samples from ComposeMany
num_augmented_samples = 4

# NOTE(review): ComposeMany is not imported in any visible cell — presumably it
# comes from torchaudio_augmentations; confirm against the library.
transform = ComposeMany(transforms, num_augmented_samples=num_augmented_samples)

print("Transform:", transform)
transformed_audio = transform(audio)
# Each element of `transformed_audio` is one independently augmented view.
for view in transformed_audio:
    plot_spectrogram(view, sr, title="")
    display(Audio(view, rate=sr))
plt.show()
Transform: ComposeMany(
	RandomResizedCrop()
	HighLowPass()
	Delay()
)
../_images/data-augmentation_31_1.png
../_images/data-augmentation_31_3.png
../_images/data-augmentation_31_5.png
../_images/data-augmentation_31_7.png

Stochastic Audio Data Augmentations

# BUG FIX: RandomApply was used here before its import, which only appears in
# a later cell — running the notebook top-to-bottom raised NameError.
from torchaudio_augmentations import RandomApply

# Candidate transforms to be applied stochastically as one chain.
transforms = [
    PolarityInversion(),
    PitchShift(sample_rate=sr, n_samples=audio.shape[1]),
    Reverb(sample_rate=sr)
]

# RandomApply runs the whole sub-chain with probability p (here a coin flip).
stochastic_transforms = [
    RandomApply(transforms, p=0.5)
]
transform = Compose(stochastic_transforms)
print(transform)
transformed_audio = transform(audio)
display(Audio(transformed_audio, rate=sr))

Audio chain stochastic augmentations

from torchaudio_augmentations import RandomApply

# we want 4 augmented samples from ComposeMany
num_augmented_samples = 4

# 4 seconds of audio
num_samples = sr * 4

stochastic_transforms = [
    RandomResizedCrop(n_samples=num_samples),

    # apply the whole polarity/filter/delay chain with p = 0.3
    RandomApply([
            PolarityInversion(),
            HighLowPass(
                sample_rate=sr,
                lowpass_freq_low=2200,
                lowpass_freq_high=4000,
                highpass_freq_low=200,
                highpass_freq_high=1200,
            ),
            Delay(
                sample_rate=sr,
                volume_factor=0.5,
                min_delay=100,
                max_delay=500,
                delay_interval=1,
            ),
        ], 
        p=0.3),

    # apply the pitch/gain/noise/reverb chain with p = 0.8
    RandomApply([
            PitchShift(sample_rate=sr, n_samples=num_samples),
            Gain(),
            Noise(max_snr=0.01),
            Reverb(sample_rate=sr)
        ],
        p=0.8)
]
transform = ComposeMany(stochastic_transforms, num_augmented_samples=num_augmented_samples)

print("Transform:", transform)

# BUG FIX: the transform was built but never applied, so the loop below
# replayed the previous cell's `transformed_audio` instead of this chain's.
transformed_audio = transform(audio)

for ta in transformed_audio:
    display(Audio(ta, rate=sr))
plt.show()
Transform: ComposeMany(
	RandomResizedCrop()
	RandomApply(
    p=0.3
    PolarityInversion()
    HighLowPass()
    Delay()
)
	RandomApply(
    p=0.8
    <torchaudio_augmentations.augmentations.pitch_shift.PitchShift object at 0x7fda773b7d60>
    Gain()
    Noise()
    Reverb()
)
)

Single stochastic augmentations

# we want 4 augmented samples from ComposeMany
num_augmented_samples = 4

# 4 seconds of audio
num_samples = sr * 4


# define our stochastic augmentations: each transform is wrapped in its own
# RandomApply, so every augmentation is switched on/off independently.
transforms = [
    RandomResizedCrop(n_samples=num_samples),
    RandomApply([PolarityInversion()], p=0.8),
    RandomApply([HighLowPass(sample_rate=sr)], p=0.6),
    RandomApply([Delay(sample_rate=sr)], p=0.6),
    RandomApply([PitchShift(sample_rate=sr, n_samples=num_samples)], p=0.3),
    RandomApply([Gain()], p=0.6),
    RandomApply([Noise(max_snr=0.01)], p=0.3),
    RandomApply([Reverb(sample_rate=sr)], p=0.5)
]


transform = ComposeMany(transforms, num_augmented_samples=num_augmented_samples)

print("Transform:", transform)
transformed_audio = transform(audio)

for ta in transformed_audio:
    # BUG FIX: original read `title=e=""`, which is a SyntaxError.
    plot_spectrogram(ta, sr, title="")
    display(Audio(ta, rate=sr))
plt.show()
Transform: ComposeMany(
	RandomResizedCrop()
	RandomApply(
    p=0.8
    PolarityInversion()
)
	RandomApply(
    p=0.6
    HighLowPass()
)
	RandomApply(
    p=0.6
    Delay()
)
	RandomApply(
    p=0.3
    <torchaudio_augmentations.augmentations.pitch_shift.PitchShift object at 0x7fda7980f1f0>
)
	RandomApply(
    p=0.6
    Gain()
)
	RandomApply(
    p=0.3
    Noise()
)
	RandomApply(
    p=0.5
    Reverb()
)
)
../_images/data-augmentation_37_1.png
../_images/data-augmentation_37_3.png
../_images/data-augmentation_37_5.png
../_images/data-augmentation_37_7.png